%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import cm
import ipyparallel as ipp
from time import time
from datetime import datetime
import motif as mf
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from scipy.stats import spearmanr
from scipy.stats import pearsonr
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
### set parameters for the motif analysis
PROTEIN_NAME = 'Mlx'
PROT_CONC = 0.1 # free protein concentration at binding reation; PBM typically 0.1 and RNACompete typically 0.002
BOTH_STRANDS = True # wheter both strands are present for binding; True if double-stranded DNA or RNA is used as probes
#TIME_DISS = 1800 # experimental time span after binding reaction during which dissociation of the protein from the probe was possible
STAGES=mf.stage(protein=PROTEIN_NAME)
### read data
## RNAcompete sample data
#dfprobes_raw=pd.read_excel('./data/RNAcompete/A2BP1.xlsx')
#dfprobes_raw=pd.read_excel('./data/RNAcompete/HNRNPA1.xlsx')
#dfprobes_raw=pd.read_excel('./data/RNAcompete/PTBP1.xlsx')
#dfprobes_raw=pd.read_excel('./data/RNAcompete/RBM24.xlsx')
dfprobes_raw=pd.read_csv('./data/samplePBMs/Mlx__pTH2882_HK.raw', sep='\t')
#dfprobes_raw=pd.read_csv('./data/samplePBMs/Klf9__pTH2353_HK.raw', sep='\t')
#dfprobes_raw=pd.read_csv('./data/samplePBMs/Prdm11__pTH3455_HK.raw', sep='\t')
#dfprobes_raw=pd.read_csv('./data/samplePBMs/Sox10__pTH1729_HK.raw', sep='\t')
print('Columns of imported Data File: %s' % dfprobes_raw.columns)
#dfprobes_raw.describe()
#dfprobes_raw.info()
Columns of imported Data File: Index(['#id_spot', 'row', 'col', 'control', 'id_probe', 'pbm_sequence',
'linker_sequence', 'mean_signal_intensity', 'mean_background_intensity',
'flag'],
dtype='object')
### select columns for probe sequence and signal
column_sequence = 'pbm_sequence'
column_signal = 'mean_signal_intensity'
background_signal = 'mean_background_intensity' #set to None if not needed
#background_signal=None
#basic preprocessing
dfprobes_raw[column_signal] = dfprobes_raw[column_signal].apply(
lambda a: np.NaN if a == ' ' else a)
dfprobes_raw[column_signal] = dfprobes_raw[column_signal].apply(
lambda a: np.NaN if a == '' else a)
dfprobes_raw[column_sequence] = dfprobes_raw[column_sequence].apply(
lambda a: np.NaN if str(a).lower() == 'nan' else a)
dfprobes_raw[column_sequence] = dfprobes_raw[column_sequence].apply(
lambda a: np.NaN if a == '' else a)
dfprobes_raw = dfprobes_raw.dropna()
#construct new dataframe with only necessary data
if type(background_signal) == type(None):
dfprobes = pd.DataFrame({
'seq':
dfprobes_raw[column_sequence].astype(str),
'signal binding':
dfprobes_raw[column_signal].astype(np.float32)
}) #rebuild dataframe
else:
dfprobes = pd.DataFrame({
'seq':
dfprobes_raw[column_sequence].astype(str),
'signal':
dfprobes_raw[column_signal].astype(np.float32),
'background':
dfprobes_raw[background_signal].astype(np.float32)
}) #rebuild dataframe
dfprobes['signal binding'] = dfprobes['signal'] - dfprobes['background']
dfprobes = dfprobes.dropna()
# display main properties of data set
dfprobes['signal binding'].plot(figsize=(15, 5))
dfprobes.describe()
### check type of nucleic acid
dfprobes['seq'] = dfprobes['seq'].apply(
lambda seq: seq.upper().replace(" ", "")) #upper and remove blanks
dfprobes['RNA'] = dfprobes['seq'].apply(
lambda seq: all(char in 'ACGU' for char in seq))
dfprobes['DNA'] = dfprobes['seq'].apply(
lambda seq: all(char in 'ACGT' for char in seq))
non_RNA_counts = len(dfprobes[dfprobes['RNA'] == False])
non_DNA_counts = len(dfprobes[dfprobes['DNA'] == False])
if non_RNA_counts < non_DNA_counts:
NUC_TYPE = 'RNA'
print('I: RNA probes detected!')
else:
NUC_TYPE = 'DNA'
print('I: DNA probes detected!')
if NUC_TYPE == 'RNA' and non_RNA_counts != 0:
print(
'E: The probe sequences appear to be RNA, however there are some non-RNA nucleotides in the sequences.'
)
print('E: Please check the following sequnces %s' %
dfprobes[dfprobes['RNA'] == False])
if NUC_TYPE == 'DNA' and non_DNA_counts != 0:
print(
'E: The probe sequences appear to be RNA, however there are some non-RNA nucleotides in the sequences.'
)
print('E: Please check the following sequnces %s' %
dfprobes[dfprobes['DNA'] == False])
I: DNA probes detected!
### option to add a constant sequence at the 3' end and 5' end
sequence_to_be_added_5 = ''
sequence_to_be_added_3 = 'CCTGT' # standard PBM arrays: CCTGTGTGAAATTGTTATCCGCTCT T7 array: GTCTTGA..
dfprobes['seq'] = sequence_to_be_added_5.upper(
) + dfprobes['seq'] + sequence_to_be_added_3.upper()
print(
f"I: The nucleotide sequence {sequence_to_be_added_5.upper()} has been added to the 5' end all probe sequences."
)
print(
f"I: The nucleotide sequence {sequence_to_be_added_3.upper()} has been added to the 3' end all probe sequences."
)
I: The nucleotide sequence has been added to the 5' end all probe sequences. I: The nucleotide sequence CCTGT has been added to the 3' end all probe sequences.
### egalize length
dfprobes['seq_length'] = dfprobes['seq'].apply(len)
if max(dfprobes['seq_length']) != min(dfprobes['seq_length']):
print('I: Probes length is not uniform, detected range: %i ..%i' %
(min(dfprobes['seq_length']), max(dfprobes['seq_length'])))
max_length = max(dfprobes['seq_length'])
dfprobes['padded_sequence'] = dfprobes['seq'].apply(
lambda seq: seq + ((max_length - len(seq)) * '-'))
print(
"I: Probe sequences have been padded at the 5' to the uniform length of %i nucleotides"
% max_length)
else:
print('I: Probe sequences have the uniform length of %i nucleotides' %
(dfprobes['seq_length'].median()))
dfprobes['padded_sequence'] = dfprobes['seq']
print('I: Total datasets contains %i sequences.' % len(dfprobes))
# visualize composition of each position
df_nucleotides = mf.split_sequence_in_nucleotides(dfprobes['seq'])
dfcount = pd.DataFrame(index=['A', 'C', 'G', 'T', 'U'])
for column in df_nucleotides:
dfcount[column] = df_nucleotides[column].value_counts()
dfcount = dfcount.fillna(0) #zeros for NaN
dfcount.transpose().plot(figsize=(15, 5), kind='bar')
print('I: Visualisation of the base composition per position')
print(
'I: If positions are invariant they can be removed before sequence analysis.'
)
I: Probes length is not uniform, detected range: 36 ..40 I: Probe sequences have been padded at the 5' to the uniform length of 40 nucleotides I: Total datasets contains 40330 sequences. I: Visualisation of the base composition per position I: If positions are invariant they can be removed before sequence analysis.
# You may remove invariant continuos positions by adjusting the slicing.
# It is recommended to leave a few invariant positions to allow for binding events
# between the variable and constant part of the probes.
dfprobes['seq'] = dfprobes['seq'].apply(lambda s: s[:40]) ### <==== do the slicing here
# visualize composition of each position
print('I: Visualisation of the base composition per position after slicing.')
df_nucleotides = mf.split_sequence_in_nucleotides(dfprobes['seq'])
dfcount = pd.DataFrame(index=['A', 'C', 'G', 'T', 'U'])
for column in df_nucleotides:
dfcount[column] = df_nucleotides[column].value_counts()
dfcount = dfcount.fillna(0) #zeros for NaN
dfcount.transpose().plot(figsize=(15, 5), kind='bar')
plt.show()
# preparation for later classification
mean = dfprobes['signal binding'].mean()
std = dfprobes['signal binding'].std()
THRESHOLD = mean + 4 * std #4*std used according to Weirauch et al., 2013
dfprobes['positive probe'] = dfprobes['signal binding'].apply(
lambda s: True if s > THRESHOLD else False)
print(
'I: The whole dataset has been used to set the threshold for a positive probe.'
)
print('I: The threshold is %f' % THRESHOLD)
print(
f"I: {len(dfprobes[dfprobes['positive probe']])} probes of {len(dfprobes)} are above threshold."
)
if len(dfprobes[dfprobes['positive probe']]) == 0:
print(
'E: No probe above THRESHOLD. Classification is not possible. Please adjust the THRESHOLD.'
)
I: Visualisation of the base composition per position after slicing.
I: The whole dataset has been used to set the threshold for a positive probe. I: The threshold is 23991.450195 I: 524 probes of 40330 are above threshold.
### Shuffle and prepare dataset for training and testing
# shuffle and split
dfprobes = shuffle(dfprobes)
dftrain, dftest = train_test_split(dfprobes, test_size=0.2)
print(
'I: The whole dataset has been split in training (80%) and test (20%) datasets.'
)
# display histogramms of test and training set
dftrain['signal binding'].plot(kind='hist', bins=25).axvline(x=THRESHOLD, color='r', linestyle='-.', lw=0.5, label='threshold classification')
dftest['signal binding'].plot(kind='hist', bins=25)
plt.show()
# generate a subset with maximal 1000 probes
downsampled_size = 1000 # You may change downsampled size here.
percentile = 0.5 * downsampled_size / len(
dftrain
) * 100 #percentile required for lowest and highest to achieve down-sampled size
if percentile < 4:
percentile = 4 #do not use only the extreme values
elif percentile > 10:
percentile = 10 #avoid taking value from the mid-range
if len(dftrain) * percentile * 2 / 100 < downsampled_size / 4:
print('W: The subset only contains %i probes - a rather low number.' %
dftrain * percentile * 2 / 100)
print(
'I: A downsampled dataset containing the lowest and highest %.1f %% of the dataset is generated.'
% percentile)
dfsubset_high = dftrain[dftrain['signal binding'] >= dftrain['signal binding'].quantile(1 - percentile / 100)] # highest part
dfsubset_low = dftrain[dftrain['signal binding'] <= dftrain['signal binding'].quantile(percentile / 100)] # lowest part
print('I: Median values of lowest and highest %.1f %%: %r %r' %
(percentile, dfsubset_low['signal binding'].quantile(0.5),
dfsubset_high['signal binding'].quantile(0.5)))
if len(dfsubset_high) + len(dfsubset_low) > downsampled_size:
print('I: The dataset is further downsampled to %i sequences.' %
downsampled_size)
dfsubset_high = dfsubset_high.sample(downsampled_size - int(downsampled_size / 2))
dfsubset_low = dfsubset_low.sample(int(downsampled_size / 2))
dfsubset = pd.concat([dfsubset_high, dfsubset_low])
else:
dfsubset = pd.concat([dfsubset_high, dfsubset_low])
dfsubset = shuffle(dfsubset)
# display main properties of downsampled data set
print('I: Histogramm of the downsampled dataset along the with classification threshold.')
dfsubset['signal binding'].plot(kind='hist', bins=25).axvline(x=THRESHOLD, color='r', linestyle='-.', lw=0.5, label='threshold classification')
plt.show()
# establish numpy arrays of the sequenc and binding data in the dataframes
# complete data
X=mf.hotencode_sequence(dfprobes['padded_sequence'], nuc_type=NUC_TYPE)
y=np.array(dfprobes['signal binding'])
# training set
X_train=mf.hotencode_sequence(dftrain['padded_sequence'], nuc_type=NUC_TYPE)
y_train=np.array(dftrain['signal binding'])
# subset of training set
X_subset=mf.hotencode_sequence(dfsubset['padded_sequence'], nuc_type=NUC_TYPE)
y_subset=np.array(dfsubset['signal binding'])
# test set
X_test=mf.hotencode_sequence(dftest['padded_sequence'], nuc_type=NUC_TYPE)
y_test=np.array(dftest['signal binding'])
I: The whole dataset has been split in training (80%) and test (20%) datasets.
I: A downsampled dataset containing the lowest and highest 4.0 % of the dataset is generated. I: Median values of lowest and highest 4.0 %: 1064.994384765625 16825.373046875 I: The dataset is further downsampled to 1000 sequences. I: Histogramm of the downsampled dataset along the with classification threshold.
### perform a quick & dirty round for a short motif by fitting on subset to check data integrity
#fit regression quick_model
quick_model=mf.findmotif(motif_length=3, protein_conc=PROT_CONC, both_strands=BOTH_STRANDS, ftol=0.01)
start = time()
quick_model.fit(X_subset,y_subset)
print("I: Optimization took %.2f hours." % ((time() - start)/3600))
# print & display main results
quick_model.analyse_motif(X_subset,y_subset, THRESHOLD, nuc_type=NUC_TYPE)
# store results and display stages
STAGES.append('quick', quick_model)
mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
WARNING:matplotlib.font_manager:findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.
I: Optimization took 0.05 hours. I: energy matrix and logos:
A C G T
0 -1722 5146 -5630 2206
1 16314 5041 -4952 -16403
2 7482 -739 -5187 -1555
I: summed absolute energies of each position:
0 14705
1 42712
2 14964
dtype: int64
I: averaged summed energy over all positions: 24127
I: Mean and Standard Deviation for the Free Energy G to all subsequences of all probes: -6069 +/- 14080
I: Plot of the Occupancy of a subsite as the function of the Free Energy G
overlaid with the distribution of the Free Energy of all subsites.
I: There shall be only a small overlap of both curves. i.e. only the most negative Free Energies
lead to a measurable occupancy.
I: Calculated occupancy over all subsite of a single probe: binding: 0.08408 .. 2.54079 (ratio: 30.2) I: number of probes: 1000 I: Pearson Correlation r: 0.6651 I: mean absolute error: 7692.7441
I: Classification performance AUROC: 0.8283
| stage | protein | # probes | motif length | r | AUROC | G0 | G0 fitted | ratio | max binding | min binding | energies | model | logo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | quick | Mlx | 1000 | 3 | 0.665118 | 0.828279 | -11473.366709 | False | 30.220111 | 2.54079 | 0.084076 | -1722,.. | suppressed |
quick_model.explore_positions(X_train, y_train)
| pos | energies | r | r under baseline | -2% | |
|---|---|---|---|---|---|
| 0 | 0 | [-0.0, 0.0, -0.0, 0.0, 16314.725740130505, 504... | 0.184025 | -0.481093 | True |
| 1 | 1 | [-1722.0232094965108, 5146.111079001937, -5630... | 0.041719 | -0.623399 | True |
| 2 | 2 | [-1722.0232094965108, 5146.111079001937, -5630... | 0.213306 | -0.451812 | True |
#### Perfrom GridCV Search for exploration of the motif length goal: identify the minimum motif length which gives a good r-value
# optional: allow for global optimization to verify whether the local optimization is good enough
# not recommended include fitG0=True. This option should only be considered when the local optimization is started with an approximate motif and the start parameter is set
# not recommended set time_dissociation. The effect of dissociation should be only considered when the local optimization is started with an approximate motif.
# prepare grid search over motif_length
model_grid=mf.findmotif(protein_conc=PROT_CONC, both_strands=BOTH_STRANDS)
param_grid = {"motif_length": [3,4,5,6,7]} # choose sensible range for length of motif
# define custom refit function
def custom_refit(cv_results):
"""returns index of max r2/sqrt(motif_length)"""
df_grid=pd.DataFrame(cv_results)
index=(df_grid['mean_test_score']/(df_grid['param_motif_length'].apply(float).apply(np.sqrt))).idxmax()
return index
# run grid search and refit according to custom refit
grid_search = GridSearchCV(model_grid, param_grid=param_grid, verbose=2, cv=5, refit=custom_refit, n_jobs=-1)
start = time()
grid_search.fit(X_subset, y_subset)
print("I: GridSearchCV took %.2f hours for %d candidate parameter settings."
% ((time() - start)/3600, len(grid_search.cv_results_["params"])))
print('I: number of samples: %i' %len(X_subset))
df_grid=pd.DataFrame(grid_search.cv_results_)
print('I: Plot of r2 vs motif length and vs root(motif length)')
df_grid.rename(columns={'mean_test_score':'r2'}, inplace=True)
df_grid.plot(kind='scatter', x='param_motif_length', y='r2', yerr='std_test_score', figsize=(5,3)).set_xticks(param_grid["motif_length"])
df_grid['r2/sqrt(length)']=df_grid['r2']/(df_grid['param_motif_length'].apply(float).apply(np.sqrt))
df_grid['std/sqrt(length)']=df_grid['std_test_score']/(df_grid['param_motif_length'].apply(float).apply(np.sqrt))
df_grid.plot(kind='scatter', x='param_motif_length', y='r2/sqrt(length)',yerr='std/sqrt(length)', figsize=(5,3)).set_xticks(param_grid["motif_length"])
plt.show()
best_index=df_grid['r2/sqrt(length)'].idxmax()
CORE_MOTIF_LENGTH=df_grid.loc[best_index, 'param_motif_length']
print(f'I: The maximum ({CORE_MOTIF_LENGTH}) is suggested as CORE_MOTIF_LENGTH')
print('I: motif obtained with the best estimator from gridCV search')
# print & display results from best estimator
model_grid=grid_search.best_estimator_
model_grid.analyse_motif(X_subset,y_subset, THRESHOLD, nuc_type=NUC_TYPE)
# store results and display stages
STAGES.append('best grid', model_grid)
mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
Fitting 5 folds for each of 5 candidates, totalling 25 fits I: GridSearchCV took 0.86 hours for 5 candidate parameter settings. I: number of samples: 1000 I: Plot of r2 vs motif length and vs root(motif length)
I: The maximum (6) is suggested as CORE_MOTIF_LENGTH I: motif obtained with the best estimator from gridCV search I: energy matrix and logos:
A C G T
0 2367 -8160 2460 3332
1 -2164 3681 -8941 7423
2 11906 2282 548 -14737
3 1648 4195 -9490 3646
4 802 1154 1269 -3225
5 1001 39 -95 -946
I: summed absolute energies of each position:
0 16320
1 22211
2 29474
3 18980
4 6451
5 2083
dtype: int64
I: averaged summed energy over all positions: 15920
I: Mean and Standard Deviation for the Free Energy G to all subsequences of all probes: -1372 +/- 13555
I: Plot of the Occupancy of a subsite as the function of the Free Energy G
overlaid with the distribution of the Free Energy of all subsites.
I: There shall be only a small overlap of both curves. i.e. only the most negative Free Energies
lead to a measurable occupancy.
I: Calculated occupancy over all subsite of a single probe: binding: 0.00141 .. 3.19934 (ratio: 2264.7) I: number of probes: 1000 I: Pearson Correlation r: 0.8042 I: mean absolute error: 5358.4744
I: Classification performance AUROC: 0.9031
| stage | protein | # probes | motif length | r | AUROC | G0 | G0 fitted | ratio | max binding | min binding | energies | model | logo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | quick | Mlx | 1000 | 3 | 0.665118 | 0.828279 | -11473.366709 | False | 30.220111 | 2.540790 | 0.084076 | -1722,.. | suppressed | |
| 1 | best grid | Mlx | 1000 | 6 | 0.804184 | 0.903057 | -1946.454775 | False | 2264.716271 | 3.199344 | 0.001413 | 2367,.. | suppressed |
### run a number of identical optimizations with motif length found during grid search
### goal: find best motif through repetition, judge stabiltiy of optimization
#CORE_MOTIF_LENGTH=5 # adjust core motif length if needed, motif length can be changed later
# prepare for ipyparallel
number_of_optimizations = 20
model_list = [mf.findmotif(motif_length=CORE_MOTIF_LENGTH, protein_conc=PROT_CONC, both_strands=BOTH_STRANDS)] * number_of_optimizations
X_list = [X_subset] * number_of_optimizations
y_list = [y_subset] * number_of_optimizations
def single_job(model, X, y):
model.fit(X, y)
return {'model':model}
# run the optimizations on ipp.cluster
start = time()
with ipp.Cluster(log_level=40) as rc:
rc[:].use_pickle()
view = rc.load_balanced_view()
asyncresult = view.map_async(single_job, model_list, X_list, y_list)
asyncresult.wait_interactive()
result = asyncresult.get()
print("I: Optimization took %.2f hours." % ((time() - start) / 3600))
# assemble results and analyze
df_repetitions=pd.DataFrame(result)
df_repetitions['r (subset)']=df_repetitions['model'].apply(lambda e: e.rvalue)
df_repetitions['r (train)']=df_repetitions['model'].apply(lambda e: mf.linregress(e.predict(X_train),y_train).rvalue)
df_repetitions['r (test)']=df_repetitions['model'].apply(lambda e: mf.linregress(e.predict(X_test),y_test).rvalue)
df_repetitions['G0']=df_repetitions['model'].apply(lambda e: e.finalG0_)
df_repetitions['max binding']=df_repetitions['model'].apply(lambda e: e.max_binding_fit)
df_repetitions['min binding']=df_repetitions['model'].apply(lambda e: e.min_binding_fit)
df_repetitions['ratio'] = df_repetitions['model'].apply(lambda e: e.ratio)
df_repetitions['energies']=df_repetitions['model'].apply(lambda e: e.energies_)
#df_repetitions['information']=df_repetitions['model'].apply(lambda e: mf.energies2information(e.energies_))
# display results of the ensemble of optimizations
print('I: Results of the repeated motif finding, sorted according to the regression coefficient with the train dataset')
df_repetitions.sort_values('r (train)', ascending=False, inplace=True)
mf.display_df(df_repetitions, nuc_type=NUC_TYPE)
0%| | 0/16 [00:00<?, ?engine/s]
single_job: 0%| | 0/20 [00:00<?, ?tasks/s]
I: Optimization took 1.03 hours. I: Results of the repeated motif finding, sorted according to the regression coefficient with the train dataset
| model | r (subset) | r (train) | r (test) | G0 | max binding | min binding | ratio | energies | logo | |
|---|---|---|---|---|---|---|---|---|---|---|
| 7 | suppressed | 0.890235 | 0.732795 | 0.732571 | -1946.454775 | 0.172153 | 0.001413 | 121.794730 | -3592,.. | |
| 16 | suppressed | 0.889981 | 0.728138 | 0.726752 | -1946.454775 | 0.022200 | 0.000176 | 125.990337 | -841,.. | |
| 17 | suppressed | 0.888720 | 0.727082 | 0.727752 | -1946.454775 | 1.412396 | 0.002897 | 487.528460 | 267,.. | |
| 4 | suppressed | 0.887252 | 0.723821 | 0.724293 | -1946.454775 | 1.370802 | 0.005552 | 246.880343 | -6852,.. | |
| 12 | suppressed | 0.868733 | 0.611553 | 0.624722 | -1946.454775 | 2.853511 | 0.019868 | 143.626054 | -4857,.. | |
| 3 | suppressed | 0.868782 | 0.611051 | 0.624704 | -1946.454775 | 2.935046 | 0.018557 | 158.165610 | -5107,.. | |
| 10 | suppressed | 0.866805 | 0.609516 | 0.622133 | -1946.454775 | 2.831423 | 0.014889 | 190.171324 | -4950,.. | |
| 2 | suppressed | 0.866651 | 0.606835 | 0.620001 | -1946.454775 | 3.001508 | 0.017313 | 173.366242 | -4981,.. | |
| 18 | suppressed | 0.859843 | 0.603754 | 0.616578 | -1946.454775 | 2.779731 | 0.003339 | 832.585500 | 443,.. | |
| 8 | suppressed | 0.865081 | 0.602153 | 0.616200 | -1946.454775 | 2.761261 | 0.005642 | 489.396493 | 179,.. | |
| 19 | suppressed | 0.855076 | 0.600044 | 0.609233 | -1946.454775 | 2.750740 | 0.050765 | 54.185733 | -9490,.. | |
| 14 | suppressed | 0.804147 | 0.492140 | 0.501041 | -1946.454775 | 3.265180 | 0.001223 | 2669.669826 | 2621,.. | |
| 1 | suppressed | 0.804646 | 0.491672 | 0.500635 | -1946.454775 | 3.310548 | 0.001151 | 2877.189644 | 2596,.. | |
| 9 | suppressed | 0.803433 | 0.491532 | 0.500840 | -1946.454775 | 3.162870 | 0.001296 | 2441.333598 | 2188,.. | |
| 5 | suppressed | 0.803785 | 0.491476 | 0.500087 | -1946.454775 | 3.193325 | 0.001248 | 2558.910713 | 1710,.. | |
| 15 | suppressed | 0.802353 | 0.490766 | 0.501396 | -1946.454775 | 3.265955 | 0.001267 | 2577.979257 | 10698,.. | |
| 0 | suppressed | 0.645968 | 0.320167 | 0.310050 | -1946.454775 | 0.092373 | 0.002677 | 34.505616 | -3483,.. | |
| 6 | suppressed | 0.677981 | 0.313893 | 0.311648 | -1946.454775 | 3.318715 | 0.004870 | 681.456664 | 443,.. | |
| 13 | suppressed | 0.681568 | 0.311863 | 0.308358 | -1946.454775 | 2.771487 | 0.005565 | 498.020474 | -6381,.. | |
| 11 | suppressed | 0.602044 | 0.276270 | 0.299125 | -1946.454775 | 3.731195 | 0.167081 | 22.331606 | -11311,.. |
### compare energy matrices of ensemble using PCA
print('I: Histogram of the regression coefficients r obtained by repeated optimizaion with the subset.')
df_repetitions['r (subset)'].plot(kind='hist')
plt.show()
pca = PCA(n_components=2)
pca_2c=pca.fit_transform(df_repetitions['energies'].tolist())
df_repetitions[['PCA1', 'PCA2']]=pca_2c
print('I: 2-dimensional PCA explained %i %% of variance.' %(sum(pca.explained_variance_ratio_)*100))
if sum(pca.explained_variance_ratio_)<0.5:
print('W: 2-dimensional PCA explained only %i %% of variance' %(sum(pca.explained_variance_ratio_)*100))
print('I: Visualization of the PCA with the regression quality vs. subset and training dataset by color.')
df_repetitions.plot(x='PCA1', y='PCA2', kind='scatter', c='r (subset)',cmap=cm.coolwarm, edgecolors='black', linewidths=0.3)
df_repetitions.plot(x='PCA1', y='PCA2', kind='scatter', c='r (train)',cmap=cm.coolwarm, edgecolors='black', linewidths=0.3)
I: Histogram of the regression coefficients r obtained by repeated optimizaion with the subset.
I: 2-dimensional PCA explained 62 % of variance. I: Visualization of the PCA with the regression quality vs. subset and training dataset by color.
/home/GLipps/.local/lib/python3.8/site-packages/sklearn/utils/deprecation.py:101: FutureWarning: Attribute `n_features_` was deprecated in version 1.2 and will be removed in 1.4. Use `n_features_in_` instead. warnings.warn(msg, category=FutureWarning)
<matplotlib.axes._subplots.AxesSubplot at 0x7f698d609ee0>
# visualisation of the motif with the highest r with the train dataset
print('I: Best motif according to r (train) from the repeated optimizations.')
print('I: PCA components: %i, %i' %(df_repetitions.iloc[0]['PCA1'], df_repetitions.iloc[0]['PCA2']))
model_best_repetition=df_repetitions.iloc[0]['model']
model_best_repetition.analyse_motif(X_subset,y_subset, THRESHOLD, nuc_type=NUC_TYPE)
# store results and display stages
STAGES.append('best repetition', model_best_repetition, new_entries={'r (test)': mf.linregress(model_best_repetition.predict(X_test),y_test).rvalue})
mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
I: Best motif according to r (train) from the repeated optimizations. I: PCA components: -19629, -10097 I: energy matrix and logos:
A C G T
0 -3592 -8307 16172 -4272
1 -11035 -3042 -3116 17194
2 4214 -4730 2665 -2149
3 -689 1418 -3579 2851
4 360 939 -57 -1242
5 2109 810 -2257 -662
I: summed absolute energies of each position:
0 32345
1 34389
2 13760
3 8538
4 2599
5 5839
dtype: int64
I: averaged summed energy over all positions: 16245
I: Mean and Standard Deviation for the Free Energy G to all subsequences of all probes: -1251 +/- 14924
I: Plot of the Occupancy of a subsite as the function of the Free Energy G
overlaid with the distribution of the Free Energy of all subsites.
I: There shall be only a small overlap of both curves. i.e. only the most negative Free Energies
lead to a measurable occupancy.
I: Calculated occupancy over all subsite of a single probe: binding: 0.00141 .. 0.17215 (ratio: 121.8) I: number of probes: 1000 I: Pearson Correlation r: 0.8902 I: mean absolute error: 4098.7474
I: Classification performance AUROC: 0.9316
| stage | protein | # probes | motif length | r | AUROC | G0 | G0 fitted | ratio | max binding | min binding | energies | model | logo | r (test) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | quick | Mlx | 1000 | 3 | 0.665118 | 0.828279 | -11473.366709 | False | 30.220111 | 2.540790 | 0.084076 | -1722,.. | suppressed | NaN | |
| 1 | best grid | Mlx | 1000 | 6 | 0.804184 | 0.903057 | -1946.454775 | False | 2264.716271 | 3.199344 | 0.001413 | 2367,.. | suppressed | NaN | |
| 2 | best repetition | Mlx | 1000 | 6 | 0.890235 | 0.931599 | -1946.454775 | False | 121.794730 | 0.172153 | 0.001413 | -3592,.. | suppressed | 0.732571 |
### motif finding on complete training dataset starting with best motif from repetitions
#fit & predict optimization starting with previous energy matrix
model_train=mf.findmotif(motif_length=CORE_MOTIF_LENGTH, protein_conc=PROT_CONC, both_strands=BOTH_STRANDS,
start=model_best_repetition.energies_)
start = time()
model_train.fit(X_train,y_train)
print("I: Optimization took %.2f hours." % ((time() - start)/3600))
# print & display main results
model_train.analyse_motif(X_train,y_train, THRESHOLD, nuc_type=NUC_TYPE)
# store results and display stages
STAGES.append('train dataset', model_train, new_entries={'r (test)': mf.linregress(model_train.predict(X_test),y_test).rvalue})
mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
I: Optimization took 7.41 hours. I: energy matrix and logos:
A C G T
0 -836 -8045 6487 2394
1 -8420 2590 2697 3132
2 3329 -5229 4091 -2190
3 -1975 4342 -6715 4347
4 833 511 1285 -2630
5 1395 2287 -3832 150
I: summed absolute energies of each position:
0 17765
1 16840
2 14841
3 17381
4 5260
5 7665
dtype: int64
I: averaged summed energy over all positions: 13292
I: Mean and Standard Deviation for the Free Energy G to all subsequences of all probes: -872 +/- 9975
I: Plot of the Occupancy of a subsite as the function of the Free Energy G
overlaid with the distribution of the Free Energy of all subsites.
I: There shall be only a small overlap of both curves. i.e. only the most negative Free Energies
lead to a measurable occupancy.
I: Calculated occupancy over all subsite of a single probe: binding: 0.00024 .. 0.56984 (ratio: 2338.7) I: number of probes: 32264 I: Pearson Correlation r: 0.8077 I: mean absolute error: 1562.0631
I: Classification performance AUROC: 0.9813
| stage | protein | # probes | motif length | r | AUROC | G0 | G0 fitted | ratio | max binding | min binding | energies | model | logo | r (test) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | quick | Mlx | 1000 | 3 | 0.665118 | 0.828279 | -11473.366709 | False | 30.220111 | 2.540790 | 0.084076 | -1722,.. | suppressed | NaN | |
| 1 | best grid | Mlx | 1000 | 6 | 0.804184 | 0.903057 | -1946.454775 | False | 2264.716271 | 3.199344 | 0.001413 | 2367,.. | suppressed | NaN | |
| 2 | best repetition | Mlx | 1000 | 6 | 0.890235 | 0.931599 | -1946.454775 | False | 121.794730 | 0.172153 | 0.001413 | -3592,.. | suppressed | 0.732571 | |
| 3 | train dataset | Mlx | 32264 | 6 | 0.807686 | 0.981296 | -1946.454775 | False | 2338.717644 | 0.569843 | 0.000244 | -836,.. | suppressed | 0.805514 |
### Based on the motif of CORE_MOTIF_LENGTH analyze the neigbouring positions
### whether their inclusion can improve the quality of the motif
df_positions=model_train.investigate_extension_parallel(X_train,y_train, end5=3, end3=3, nuc_type=NUC_TYPE)
list_positions=df_positions.index[df_positions['+2%']].tolist()+[0] # list of positions with an increase of2% and default position 0
ext5=-min(list_positions)
ext3=max(list_positions)
print("I: It is suggested to extend the core motif at the 5' end by %i and at the 3' end by %i positions." %(ext5, ext3))
### Analyze model whether the estimated G0 is correct
#df_G0=model_train.investigate_G0(X_train,y_train)
0%| | 0/6 [00:00<?, ?engine/s]
job5: 0%| | 0/3 [00:00<?, ?tasks/s]
job3: 0%| | 0/3 [00:00<?, ?tasks/s]
I: Optimization took 0.27 hours.
I: It is suggested to extend the core motif at the 5' end by 0 and at the 3' end by 0 positions. I: Current G0 = -1946 J/mol (see red broken line in figure below) with r = 0.808. I: Maximal r is 0.808 at G0=-1946 J/mol (see green broken line below). I: Maximal occupancy of 2 is reached at G0=-7946 J/mol (see blue broken line below). I: Maximal occupancy of 0.2 is reached at G0=1054 J/mol (see blue broken line below).
I: G0 is in a range leading to maximal probe occupancy between 0.2 and 2. Good. I: Maximal r is close to r achieved with current G0. Good.
### fit & predict optimization starting with extended energy matrix if extension appears to improve prediction
if ext5+ext3!=0: #extension suggestion from previous analysis of the bordering positions
expanded_energies=model_train.energies_
# append energies of single-optimized bordering positions to energies of central part
if ext5!=0:
energies_5=np.concatenate(df_positions['energies'][(df_positions.index<0) & (df_positions.index>=-ext5)].to_numpy())
expanded_energies=np.concatenate((energies_5, expanded_energies))
if ext3!=0:
energies_3=np.concatenate(df_positions['energies'][(df_positions.index<=ext3) & (df_positions.index>0)].to_numpy().flatten())
expanded_energies=np.concatenate((expanded_energies, energies_3))
mf.energies2logo(expanded_energies, nuc_type=NUC_TYPE)
print('I: Optimization started with following extended motif.')
expanded_motif_length=len(expanded_energies)//4
model_extended=mf.findmotif(motif_length=expanded_motif_length, protein_conc=PROT_CONC, both_strands=BOTH_STRANDS,
start=expanded_energies)
start = time()
model_extended.fit(X_train,y_train)
print("Optimization took %.2f hours." % ((time() - start)/3600))
# print & display main results
model_extended.analyse_motif(X_train,y_train, THRESHOLD, nuc_type=NUC_TYPE)
# store results and display stages
STAGES.append('train, extended', model_extended, new_entries={'r (test)': mf.linregress(model_extended.predict(X_test),y_test).rvalue})
mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
else:
print('I: Motif is not extended based on previous analysis.')
I: Motif is not extended based on previous analysis.
### fit & predict optimization starting with extended energy matrix plus one bordering position on each side if current bordering position exceed the information of 0.25
last_model=STAGES.df.at[max(STAGES.df.index),'model']
I_5=mf.energies2information(model_extended.energies_[0:4])>=0.25 #sufficient information content of 5' end position
I_3=mf.energies2information(model_extended.energies_[-4:])>=0.25 #sufficient information content of 3' end position
if I_5 or I_3:
print('I: At least one of the bordering positions of the current motif has an information content of at least 0.25. Extending.')
expanded_energies_with_border=mf.modify_energies(last_model.energies_, end5=I_5, end3=I_3)
mf.energies2logo(expanded_energies_with_border, nuc_type=NUC_TYPE)
motif_length_with_border=len(expanded_energies_with_border)//4
model_with_border=mf.findmotif(motif_length=motif_length_with_border, protein_conc=PROT_CONC, both_strands=BOTH_STRANDS,
start=expanded_energies_with_border)
start = time()
model_with_border.fit(X_train,y_train)
print("Optimization took %.2f hours." % ((time() - start)/3600))
# print & display main results
model_with_border.analyse_motif(X_train,y_train, THRESHOLD, nuc_type=NUC_TYPE)
# store results and display stages
STAGES.append('train, expanded, border', model_with_border, new_entries={'r (test)': mf.linregress(model_with_border.predict(X_test),y_test).rvalue})
mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
else:
print('I: Both bordering positions of the found motif have an information content below 0.25. No futher optimization required.')
I: At least one of the bordering positions of the current motif has an information content of at least 0.25. Extending.
Optimization took 13.82 hours. I: energy matrix and logos:
A C G T
0 459 692 142 -1294
1 5204 -12203 5272 1726
2 -8878 -1333 5133 5078
3 4942 -6978 4403 -2368
4 -1059 3959 -6608 3708
5 857 510 2563 -3931
6 2011 3067 -4934 -144
7 419 -532 -433 546
I: summed absolute energies of each position:
0 2589
1 24407
2 20423
3 18693
4 15335
5 7863
6 10156
7 1932
dtype: int64
I: averaged summed energy over all positions: 12675
I: Mean and Standard Deviation for the Free Energy G to all subsequences of all probes: 1634 +/- 12419
I: Plot of the Occupancy of a subsite as the function of the Free Energy G
overlaid with the distribution of the Free Energy of all subsites.
I: There shall be only a small overlap of both curves. i.e. only the most negative Free Energies
lead to a measurable occupancy.
I: Calculated occupancy over all subsite of a single probe: binding: 0.00004 .. 1.29615 (ratio: 32369.3) I: number of probes: 32264 I: Pearson Correlation r: 0.8314 I: mean absolute error: 1542.7390
I: Classification performance AUROC: 0.9889
| pos | energies | r | r under baseline | -2% | |
|---|---|---|---|---|---|
| 0 | 0 | [0.0, 0.0, 0.0, -0.0, 5204.1920833228105, -122... | 0.822340 | -0.009062 | False |
| 1 | 1 | [459.87578237203417, 692.1935600187725, 142.71... | 0.557521 | -0.273880 | True |
| 2 | 2 | [459.87578237203417, 692.1935600187725, 142.71... | 0.607805 | -0.223596 | True |
| 3 | 3 | [459.87578237203417, 692.1935600187725, 142.71... | 0.507975 | -0.323426 | True |
| 4 | 4 | [459.87578237203417, 692.1935600187725, 142.71... | 0.505053 | -0.326349 | True |
| 5 | 5 | [459.87578237203417, 692.1935600187725, 142.71... | 0.687031 | -0.144370 | True |
| 6 | 6 | [459.87578237203417, 692.1935600187725, 142.71... | 0.613651 | -0.217751 | True |
| 7 | 7 | [459.87578237203417, 692.1935600187725, 142.71... | 0.828276 | -0.003126 | False |
df_relevant_positions=last_model.explore_positions(X_train, y_train)
list_positions=df_relevant_positions.index[df_relevant_positions['-2%']].tolist() # list of positions with an increase of2% and default position 0
start_relevant=min(list_positions)
end_relevant=max(list_positions)
red5=-start_relevant
red3=end_relevant-len(df_relevant_positions)+1
print('I: The analysis suggests, that positions between %i to %i contribute significantly to the motif' %(start_relevant, end_relevant))
last_model=STAGES.df.at[max(STAGES.df.index),'model']
if (end_relevant-start_relevant+1)in STAGES.df['motif length'].to_list():
print('I: No need for a further optimization. An optimization with motif length of %i has already been done.' %(end_relevant-start_relevant))
print('I: Checking whether G0 has been chosen correctly.')
last_model.investigate_G0(X_train, y_train)
else:
print('I: Bordering positions only marginally contributing towards regression quality are dropped.')
print('I: New start energy for motif optimization:')
start_final_model=mf.modify_energies(last_model.energies_, end5=red5, end3=red3)
mf.energies2logo(start_final_model, nuc_type=NUC_TYPE)
final_model=mf.findmotif(motif_length=len(start_final_model)//4, protein_conc=PROT_CONC,
both_strands=BOTH_STRANDS, start=start_final_model)
start = time()
final_model.fit(X_train,y_train)
print("Optimization took %.2f hours." % ((time() - start)/3600))
# print & display main results
final_model.analyse_motif(X_train,y_train, THRESHOLD, nuc_type=NUC_TYPE)
print('I: Checking whether G0 has been chosen correctly.')
final_model.investigate_G0(X_train, y_train)
# store results and display stages
STAGES.append('train, shrinked', final_model, new_entries={'r (test)': mf.linregress(final_model.predict(X_test),y_test).rvalue})
mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
I: The analysis suggests, that only positions between 0 to 5 contribute significantly to the motif I: Bordering positions only marginally contributing towards regression quality are dropped. I: New start energy for motif optimization:
--------------------------------------------------------------------------- KeyboardInterrupt Traceback (most recent call last) <ipython-input-75-d6e21e4e6261> in <module> 21 22 start = time() ---> 23 final_model.fit(X_train,y_train) 24 print("Optimization took %.2f hours." % ((time() - start)/3600)) 25 ~/fhnw/python/T7 primase site/RNAcompete/motif.py in fit(self, X, y) 560 res = optimize.dual_annealing(target, bounds) 561 else: --> 562 res = minimize(target, start, method='Powell',tol=None, callback=None, options=options, bounds=bounds) 563 564 # finalize with results of optimization ~/.local/lib/python3.8/site-packages/scipy/optimize/_minimize.py in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options) 688 **options) 689 elif meth == 'powell': --> 690 res = _minimize_powell(fun, x0, args, callback, bounds, **options) 691 elif meth == 'cg': 692 res = _minimize_cg(fun, x0, args, jac, callback, **options) ~/.local/lib/python3.8/site-packages/scipy/optimize/_optimize.py in _minimize_powell(func, x0, args, callback, bounds, xtol, ftol, maxiter, maxfev, disp, direc, return_all, **unknown_options) 3180 direc1 = direc[i] 3181 fx2 = fval -> 3182 fval, x, direc1 = _linesearch_powell(func, x, direc1, 3183 tol=xtol * 100, 3184 lower_bound=lower_bound, ~/.local/lib/python3.8/site-packages/scipy/optimize/_optimize.py in _linesearch_powell(func, p, xi, tol, lower_bound, upper_bound, fval) 2918 elif not np.isneginf(bound[0]) and not np.isposinf(bound[1]): 2919 # we can use a bounded scalar minimization -> 2920 res = _minimize_scalar_bounded(myfunc, bound, xatol=tol / 100) 2921 xi = res.x * xi 2922 return squeeze(res.fun), p + xi, xi ~/.local/lib/python3.8/site-packages/scipy/optimize/_optimize.py in _minimize_scalar_bounded(func, bounds, args, xatol, maxiter, disp, **unknown_options) 2162 si = np.sign(rat) + (rat == 0) 2163 x = xf + si * np.maximum(np.abs(rat), tol1) -> 2164 fu = func(x, *args) 2165 num += 1 2166 fmin_data = (num, x, fu) ~/.local/lib/python3.8/site-packages/scipy/optimize/_optimize.py in myfunc(alpha) 2901 """ 2902 def myfunc(alpha): -> 2903 return func(p + alpha*xi) 2904 2905 # if xi is zero, then don't optimize ~/.local/lib/python3.8/site-packages/scipy/optimize/_optimize.py in function_wrapper(x, *wrapper_args) 494 ncalls[0] += 1 495 # A copy of x is sent to the user function (gh13740) --> 496 fx = function(np.copy(x), *(wrapper_args + args)) 497 # Ideally, we'd like to a have a true scalar returned from f(x). For 498 # backwards-compatibility, also allow np.array([1.3]), ~/fhnw/python/T7 primase site/RNAcompete/motif.py in target(energies_ACG) 526 def target(energies_ACG): 527 energies=acg2acgt(energies_ACG) --> 528 binding=self.calculate_binding(sub,G0,energies) 529 r=linregress(y, binding).rvalue #regression (shall be positive) 530 penalty_base=np.sum(np.maximum(abs(energies)- self.threshold_base, 0)**2)*self.motif_length*self.penalty_base #penalty if energy of a base is too high ~/fhnw/python/T7 primase site/RNAcompete/motif.py in calculate_binding(self, sub, G0, energies) 446 #this is the calculation for the forward probe sequence 447 G=sub*energies #calculate energies for each base of all subsequences --> 448 Gsub=np.apply_along_axis(sum, 2, G)+G0 #sum-up over all positions of a subsequence for each subsequence and add G0 449 f=lambda G: self.protein_conc/(self.protein_conc+(1/np.exp(-G/8.31/298)*1E6)) * np.exp(-self.time_dissociation*1/np.exp(-G/8.31/298)*self.kon) 450 # (1/np.exp(-dG/R/T)*1E6) equals to Kd [uM] ~/.local/lib/python3.8/site-packages/numpy/core/overrides.py in apply_along_axis(*args, **kwargs) ~/.local/lib/python3.8/site-packages/numpy/lib/shape_base.py in apply_along_axis(func1d, axis, arr, *args, **kwargs) 400 buff[ind0] = res 401 for ind in inds: --> 402 buff[ind] = asanyarray(func1d(inarr_view[ind], *args, **kwargs)) 403 404 if not isinstance(res, matrix): KeyboardInterrupt:
mf.display_df(STAGES.df, nuc_type=NUC_TYPE)
| stage | protein | # probes | motif length | r | AUROC | G0 | G0 fitted | ratio | max binding | min binding | energies | model | logo | r (test) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | quick | Mlx | 1000 | 3 | 0.665118 | 0.828279 | -11473.366709 | False | 30.220111 | 2.540790 | 0.084076 | -1722,.. | suppressed | NaN | |
| 1 | best grid | Mlx | 1000 | 6 | 0.804184 | 0.903057 | -1946.454775 | False | 2264.716271 | 3.199344 | 0.001413 | 2367,.. | suppressed | NaN | |
| 2 | best repetition | Mlx | 1000 | 6 | 0.890235 | 0.931599 | -1946.454775 | False | 121.794730 | 0.172153 | 0.001413 | -3592,.. | suppressed | 0.732571 | |
| 3 | train dataset | Mlx | 32264 | 6 | 0.807686 | 0.981296 | -1946.454775 | False | 2338.717644 | 0.569843 | 0.000244 | -836,.. | suppressed | 0.805514 | |
| 4 | train, expanded, border | Mlx | 32264 | 8 | 0.831401 | 0.988867 | 3085.476013 | False | 32369.323534 | 1.296148 | 0.000040 | 459,.. | suppressed | 0.817456 |
STAGES.df.to_json('%s_%s-%s-%s_%s-%s.json' %(PROTEIN_NAME, datetime.now().year, datetime.now().month,datetime.now().day , datetime.now().hour, datetime.now().minute))
STAGES.df.to_pickle('%s_%s-%s-%s_%s-%s.pkl' %(PROTEIN_NAME, datetime.now().year, datetime.now().month,datetime.now().day , datetime.now().hour, datetime.now().minute))
df_stages.drop(index='best grid fitG0=True', inplace=True)
"""
expanded_energies=mf.modify_energies(model_train.energies_, end5=ext5, end3=ext3) # <==== adjust end5 and end3 to enlarge core motif on 5' and 3' end
mf.energies2logo(expanded_energies, nuc_type=NUC_TYPE)
expanded_motif_length=len(expanded_energies)//4
"""
import importlib
importlib.reload(mf)
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
<module 'motif' from '/home/GLipps/fhnw/python/T7 primase site/RNAcompete/motif.py'>
start = time()
model_mae=model_with_border.refit_mae(X,y)
print("Optimization took %.2f hours." % ((time() - start)/3600))
# print & display main results
model_mae.analyse_motif(X_train,y_train, THRESHOLD, nuc_type=NUC_TYPE)
# store results and display stages
STAGES.append('train, expanded, border, mae', model_mae, new_entries={'r (test)': mf.linregress(model_mae.predict(X_test),y_test).rvalue})
mf.display_df(STAGES.df, nuc_type=NUC_TYPE)